########################################################
# Project:    Talking to the Populist Radical Right
# Task:       The script produces the raw correspondence
#             analysis scores for the Dutch parties
# Author:     Jan Schwalbach (21/07/2022)
########################################################

# Loading packages and data
library(quanteda.textmodels)
library(quanteda.textplots)

library(quanteda)
library(RCurl)
library(XML)
library(stringr)
library(lubridate)
library(caret)
library(e1071)
library(plyr)
library(ggplot2)
library(dplyr)
library(readtext)
library(data.table)
library(wordshoal)
library(DT)
library(xtable)
library(zoo)
library(tm)

# Restore the saved workspace image; the rest of the script expects it to
# provide an object called `corpusNL`.
load("Corpus_Netherlands.Rdata")

# CH is coerced to numeric so it can be compared with numbers, and the
# `raw_text2` column becomes the `text` column used further down.
corpusNL$CH <- as.numeric(corpusNL$CH)
corpusNL$text <- corpusNL$raw_text2

# Keep only the rows with a CH value above zero.
has_positive_ch <- corpusNL$CH > 0
corpusNL <- corpusNL[has_positive_ch, ]

# Deleting all speeches by parties that are not considered in the
# analysis, and all speeches flagged as `voorzitter` (the speaker).

# Party labels that are excluded from the analysis. "NA" is the literal
# string; genuinely missing party values are handled by is.na() below.
excluded_parties <- c("other", "NA", "GPV", "PvdD", "RPF", "SGP", "LPF")

# Build one NA-free logical mask instead of chaining nine `!=` subsets.
# Indexing a data frame with an NA condition inserts all-NA rows, so the
# chained version only produced a clean result because the final
# !is.na(party) step removed those rows again. The rows kept here are the
# same: party present and not excluded, `voorzitter` present and not TRUE.
keep_party <- !is.na(corpusNL$party) &
  !(corpusNL$party %in% excluded_parties)
keep_non_chair <- !is.na(corpusNL$voorzitter) &
  corpusNL$voorzitter != TRUE

corpusNL <- corpusNL[keep_party & keep_non_chair, ]

### Subsetting the data set

# To get the scores for each type of debate, select one of the three options.

# 1. All speeches: No changes needed

# 2. Immigration subsetting

# Option 2 (immigration). This is one of the alternative subsets described
# above; if the script is run straight through, the education subsetting
# further down is applied on top of it.
#
# A row qualifies when migrationcount is above 2 or migrationcount1 is at
# least 1. The two selections are stacked with rbind() and exact duplicate
# rows are removed; this keeps the row order of the original approach
# (first selection first, then the additional rows of the second one).
corpusNL1 <- corpusNL[corpusNL$migrationcount > 2, ]
corpusNL2 <- corpusNL[corpusNL$migrationcount1 >= 1, ]
corpusNL <- unique(rbind(corpusNL1, corpusNL2))

# Finally require a strictly positive migrationcount.
corpusNL <- corpusNL[corpusNL$migrationcount > 0, ]

# 3. Education subsetting

# Option 3 (education). A row qualifies when educationcount is above 2 or
# educationcount1 is at least 1. The two selections are stacked with
# rbind() and exact duplicate rows are removed, which keeps the rows of the
# first selection ahead of the additional rows of the second one.
corpusNL1 <- corpusNL[corpusNL$educationcount > 2, ]
corpusNL2 <- corpusNL[corpusNL$educationcount1 >= 1, ]
corpusNL <- unique(rbind(corpusNL1, corpusNL2))

# Unlike the immigration option, the extra "count must be positive" filter
# is switched off here:
#corpusNL <-corpusNL[!corpusNL$educationcount<=0,]

# Aggregating all speeches by a party on the debate level
# Build the grouping keys. `debateunique` combines the `sessiondate` and
# `debate` columns (punctuation stripped, so the key contains no commas);
# `debateparty` prefixes it with `partyquarter`.
# plyr (ddply) and tm (removePunctuation) are attached at the top of the
# script, so plyr is not attached a second time here.
corpusNL$debateunique <- paste(corpusNL$sessiondate, corpusNL$debate)
corpusNL$debateunique <- removePunctuation(corpusNL$debateunique)
corpusNL$debateparty <- paste(corpusNL$partyquarter, corpusNL$debateunique)

# Collapse all rows that share a `debateparty` key into a single row. Every
# column is pasted into one comma-separated string; for `text` this
# concatenates the speeches, for the other columns only the first entry is
# kept afterwards.
corpusneu <- ddply(corpusNL, .(debateparty), summarize,
                   date = paste(date, collapse = ","),
                   CH = paste(CH, collapse = ","),
                   partyyear = paste(year, collapse = ","),
                   partyquarter = paste(partyquarter, collapse = ","),
                   debateunique = paste(debateunique, collapse = ","),
                   text = paste(text, collapse = ","),
                   year = paste(year, collapse = ","),
                   quarter = paste(quarter, collapse = ","),
                   type = paste(type, collapse = ","),
                   party = paste(party, collapse = ","))

# Keep only the first entry of each pasted metadata column by cutting the
# string at the first comma. The results are character vectors (this
# includes CH). `partyyear` and `partyquarter` are not truncated because
# they are rebuilt from `party`, `year` and `quarter` right below; the old
# code truncated them first, and derived `partyquarter` from `partyyear`
# by mistake, but both values were overwritten immediately.
first_entry_cols <- c("party", "type", "CH", "date", "debateunique",
                      "year", "quarter")
for (col in first_entry_cols) {
  corpusneu[[col]] <- gsub("\\,.*", "", corpusneu[[col]], fixed = FALSE)
}

corpusneu$partyquarter <- paste(corpusneu$party, corpusneu$quarter)
corpusneu$partyyear <- paste(corpusneu$party, corpusneu$year)

corpusNL <- as.data.frame(corpusneu)

# Keeping only debates with more than two party-level documents
# (the filter is n() > 2; NOTE: "more than one party" would be n() > 1)
# and aggregated party texts of at least 50 words

# Keep a debate only if it has more than two rows, i.e. more than two
# aggregated party documents, then drop the grouping again and return to a
# plain data frame.
corpusNL <- corpusNL %>%
  group_by(debateunique) %>%
  filter(n() > 2) %>%
  ungroup() %>%
  as.data.frame()

# Count the word-character runs in each aggregated text and keep the
# documents with at least 50 of them.
corpusNL$length <- str_count(corpusNL$text, "\\w+")
is_long_enough <- corpusNL$length >= 50
corpusNL <- corpusNL[is_long_enough, ]

# Dividing the corpus into legislative periods

# After the aggregation step CH is a character column (it went through
# paste() and gsub()). Comparing a character vector with a number makes R
# compare strings, so a value such as "10" would satisfy `CH < 5`. The
# comparisons are therefore done on a numeric copy; rows whose CH cannot be
# converted end up in none of the periods.
ch_num <- as.numeric(corpusNL$CH)
ch_known <- !is.na(ch_num)

corpusNL2006 <- corpusNL[ch_known & ch_num > 0 & ch_num < 5, ]
corpusNL2010 <- corpusNL[ch_known & ch_num > 4 & ch_num < 7, ]
corpusNL2012 <- corpusNL[ch_known & ch_num > 6, ]

### To get the scores for all/government/opposition debates,
### select one of the three options for the respective legislative period

# Government- and opposition-type subsets of each period, based on the
# `type` column.
corpusNL2006g <- corpusNL2006[corpusNL2006$type == "government", ]
corpusNL2006o <- corpusNL2006[corpusNL2006$type == "opposition", ]

corpusNL2010g <- corpusNL2010[corpusNL2010$type == "government", ]
corpusNL2010o <- corpusNL2010[corpusNL2010$type == "opposition", ]

corpusNL2012g <- corpusNL2012[corpusNL2012$type == "government", ]
corpusNL2012o <- corpusNL2012[corpusNL2012$type == "opposition", ]

# Creating a dfm (removing punctuation, symbols, numbers, and stopwords; stemming; grouping by party)

# Select the subset that is scaled below. Previously all nine corpora were
# built one after another and each assignment overwrote the previous one,
# so only the last one (2012, opposition) was actually used. The defaults
# here reproduce that outcome; change the two values to analyse another
# subset.
ca_period <- "2012"       # one of "2006", "2010", "2012"
ca_type <- "opposition"   # one of "all", "government", "opposition"

corpus_choices <- list(
  "2006" = list(all = corpusNL2006,
                government = corpusNL2006g,
                opposition = corpusNL2006o),
  "2010" = list(all = corpusNL2010,
                government = corpusNL2010g,
                opposition = corpusNL2010o),
  "2012" = list(all = corpusNL2012,
                government = corpusNL2012g,
                opposition = corpusNL2012o)
)

# Fail early on a mistyped selection instead of passing NULL to corpus().
stopifnot(
  ca_period %in% names(corpus_choices),
  ca_type %in% names(corpus_choices[[ca_period]])
)

corpusNLcorpus <- corpus(corpus_choices[[ca_period]][[ca_type]])

# Tokenise the selected corpus, dropping punctuation, symbols and numbers.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
NL_token <- tokens(corpusNLcorpus,
                   remove_punct = TRUE,
                   remove_symbols = TRUE,
                   remove_numbers = TRUE)

# Remove Dutch stopwords.
# NOTE(review): tm is attached after quanteda, so `stopwords()` presumably
# resolves to tm's version - confirm this is the intended list.
NL_token <- tokens_select(NL_token,
                          pattern = stopwords("dutch"),
                          selection = "remove")

# Document-feature matrix with Dutch stemming applied to the features.
dfm_NL <- dfm(NL_token)
dfm_NL <- dfm_wordstem(dfm_NL, language = "dutch")
#dfm_NL <- dfm_trim(dfm_NL, min_docfreq = 2)

# Sum the counts of all documents that share the same `party` value, so the
# matrix has one row per party.
dfm_NL <- dfm_group(dfm_NL, groups = party)


# Fit the correspondence analysis on the party-level dfm.
tmod_ca <- textmodel_ca(dfm_NL)

# Document (here: party) coordinates on the first two dimensions.
coords_dim1 <- coef(tmod_ca, doc_dim = 1)$coef_document
coords_dim2 <- coef(tmod_ca, doc_dim = 2)$coef_document
dat_ca <- data.frame(dim1 = coords_dim1, dim2 = coords_dim2)

# Attach the row labels of the model as `party`. Anything from a space
# followed by four digits onwards is cut off, which leaves labels without
# such a suffix unchanged.
allvalues <- dat_ca
allvalues$party <- gsub(" [0-9][0-9][0-9][0-9].*", "", tmod_ca$rownames,
                        fixed = FALSE)

# One-dimensional plot of the fitted model.
textplot_scale1d(tmod_ca)

# Plotting the respective values for the two most important dimensions

# Scatter plot of the parties on the first two dimensions, with a filled,
# repelled label per party.
# geom_label_repel() comes from the ggrepel package, which this script
# never attaches, so it is called with its namespace; the bare call stops
# with "could not find function".
# NOTE(review): the manual fill scale provides eight colours; with more
# than eight distinct `party` values ggplot2 is expected to reject the
# scale - confirm the number of parties.
p <- ggplot(allvalues, aes(dim1, dim2)) +
  scale_fill_manual(values = c("seagreen4", "steelblue1", "green",
                               "limegreen", "red", "blue4", "red4",
                               "dodgerblue4")) +
  geom_point(color = "red") +
  theme_classic(base_size = 20) +
  ggrepel::geom_label_repel(aes(label = party, fill = party),
                            color = "white",
                            size = 5) +
  theme(legend.position = "none") +
  labs(title = "Debates Netherlands", y = "Dimension 2", x = "Dimension 1")
p

### Creating data sets for the analysis for:

# all debates

# Each of the three objects below is a copy of whatever `allvalues` holds
# at the moment the line is run, with the two dimension columns renamed.
# If the script is run straight through without re-fitting the model in
# between, the three copies contain the same scores.
CA_NL_all <- allvalues
names(CA_NL_all)[1:2] <- c("CA_NL_all_D1", "CA_NL_all_D2")

CA_NL_all_gov <- allvalues
names(CA_NL_all_gov)[1:2] <- c("CA_NL_all_gov_D1", "CA_NL_all_gov_D2")

CA_NL_all_op <- allvalues
names(CA_NL_all_op)[1:2] <- c("CA_NL_all_op_D1", "CA_NL_all_op_D2")

# Side-by-side table of the three sets of scores (the `party` column is
# repeated once per set), written to three files.
Positions_CA_NL_ALL <- cbind(CA_NL_all, CA_NL_all_gov, CA_NL_all_op)
for (out_file in c("CA_NL_ALL.Rdata",
                   "CA_NL_ALL_10.Rdata",
                   "CA_NL_ALL_12.Rdata")) {
  save(Positions_CA_NL_ALL, file = out_file) # save positions
}

# Immigration debates

# Each of the three objects below is a copy of whatever `allvalues` holds
# at the moment the line is run, with the two dimension columns renamed.
# If the script is run straight through without re-fitting the model in
# between, the three copies contain the same scores.
CA_NL_MIG <- allvalues
names(CA_NL_MIG)[1:2] <- c("CA_NL_MIG_D1", "CA_NL_MIG_D2")

CA_NL_MIG_gov <- allvalues
names(CA_NL_MIG_gov)[1:2] <- c("CA_NL_MIG_gov_D1", "CA_NL_MIG_gov_D2")

CA_NL_MIG_op <- allvalues
names(CA_NL_MIG_op)[1:2] <- c("CA_NL_MIG_op_D1", "CA_NL_MIG_op_D2")

# Side-by-side table of the three sets of scores (the `party` column is
# repeated once per set), written to three files.
Positions_CA_NL_MIG <- cbind(CA_NL_MIG, CA_NL_MIG_gov, CA_NL_MIG_op)
for (out_file in c("CA_NL_MIG.Rdata",
                   "CA_NL_MIG_10.Rdata",
                   "CA_NL_MIG_12.Rdata")) {
  save(Positions_CA_NL_MIG, file = out_file) # save positions
}

# Education debates

# Each of the three objects below is a copy of whatever `allvalues` holds
# at the moment the line is run, with the two dimension columns renamed.
# If the script is run straight through without re-fitting the model in
# between, the three copies contain the same scores.
CA_NL_ED <- allvalues
names(CA_NL_ED)[1:2] <- c("CA_NL_ED_D1", "CA_NL_ED_D2")

CA_NL_ED_gov <- allvalues
names(CA_NL_ED_gov)[1:2] <- c("CA_NL_ED_gov_D1", "CA_NL_ED_gov_D2")

CA_NL_ED_op <- allvalues
names(CA_NL_ED_op)[1:2] <- c("CA_NL_ED_op_D1", "CA_NL_ED_op_D2")

# Side-by-side table of the three sets of scores (the `party` column is
# repeated once per set), written to three files.
Positions_CA_NL_ED <- cbind(CA_NL_ED, CA_NL_ED_gov, CA_NL_ED_op)
for (out_file in c("CA_NL_ED.Rdata",
                   "CA_NL_ED_10.Rdata",
                   "CA_NL_ED_12.Rdata")) {
  save(Positions_CA_NL_ED, file = out_file) # save positions
}
